In [1]:
#Import required libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns ; sns.set()
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
import sklearn.metrics as metrics
from sklearn.metrics import classification_report, confusion_matrix
from statistics import mean
from sklearn.linear_model import LinearRegression, Ridge, Lasso,SGDClassifier
from sklearn.pipeline import Pipeline
In [2]:
#Read the csv file
data = pd.read_csv("Covid Dataset.csv")
print(len(data))
data.head()
5434
Out[2]:
Breathing Problem Fever Dry Cough Sore throat Running Nose Asthma Chronic Lung Disease Headache Heart Disease Diabetes ... Fatigue Gastrointestinal Abroad travel Contact with COVID Patient Attended Large Gathering Visited Public Exposed Places Family working in Public Exposed Places Wearing Masks Sanitization from Market COVID-19
0 Yes Yes Yes Yes Yes No No No No Yes ... Yes Yes No Yes No Yes Yes No No Yes
1 Yes Yes Yes Yes No Yes Yes Yes No No ... Yes No No No Yes Yes No No No Yes
2 Yes Yes Yes Yes Yes Yes Yes Yes No Yes ... Yes Yes Yes No No No No No No Yes
3 Yes Yes Yes No No Yes No No Yes Yes ... No No Yes No Yes Yes No No No Yes
4 Yes Yes Yes Yes Yes No Yes Yes Yes Yes ... No Yes No Yes No Yes No No No Yes

5 rows × 21 columns

In [3]:
type(data)
Out[3]:
pandas.core.frame.DataFrame
In [4]:
data.describe(include='all')
Out[4]:
Breathing Problem Fever Dry Cough Sore throat Running Nose Asthma Chronic Lung Disease Headache Heart Disease Diabetes ... Fatigue Gastrointestinal Abroad travel Contact with COVID Patient Attended Large Gathering Visited Public Exposed Places Family working in Public Exposed Places Wearing Masks Sanitization from Market COVID-19
count 5434 5434 5434 5434 5434 5434 5434 5434 5434 5434 ... 5434 5434 5434 5434 5434 5434 5434 5434 5434 5434
unique 2 2 2 2 2 2 2 2 2 2 ... 2 2 2 2 2 2 2 1 1 2
top Yes Yes Yes Yes Yes No No Yes No No ... Yes No No Yes No Yes No No No Yes
freq 3620 4273 4307 3953 2952 2920 2869 2736 2911 2846 ... 2821 2883 2983 2726 2924 2820 3172 5434 5434 4383

4 rows × 21 columns

In [5]:
data.nunique()
Out[5]:
Breathing Problem                          2
Fever                                      2
Dry Cough                                  2
Sore throat                                2
Running Nose                               2
Asthma                                     2
Chronic Lung Disease                       2
Headache                                   2
Heart Disease                              2
Diabetes                                   2
Hyper Tension                              2
Fatigue                                    2
Gastrointestinal                           2
Abroad travel                              2
Contact with COVID Patient                 2
Attended Large Gathering                   2
Visited Public Exposed Places              2
Family working in Public Exposed Places    2
Wearing Masks                              1
Sanitization from Market                   1
COVID-19                                   2
dtype: int64
In [6]:
data.isna().sum()
Out[6]:
Breathing Problem                          0
Fever                                      0
Dry Cough                                  0
Sore throat                                0
Running Nose                               0
Asthma                                     0
Chronic Lung Disease                       0
Headache                                   0
Heart Disease                              0
Diabetes                                   0
Hyper Tension                              0
Fatigue                                    0
Gastrointestinal                           0
Abroad travel                              0
Contact with COVID Patient                 0
Attended Large Gathering                   0
Visited Public Exposed Places              0
Family working in Public Exposed Places    0
Wearing Masks                              0
Sanitization from Market                   0
COVID-19                                   0
dtype: int64
In [7]:
for i in data.columns:
    print(data[i].value_counts(),"\n")
Yes    3620
No     1814
Name: Breathing Problem, dtype: int64 

Yes    4273
No     1161
Name: Fever, dtype: int64 

Yes    4307
No     1127
Name: Dry Cough, dtype: int64 

Yes    3953
No     1481
Name: Sore throat, dtype: int64 

Yes    2952
No     2482
Name: Running Nose, dtype: int64 

No     2920
Yes    2514
Name: Asthma, dtype: int64 

No     2869
Yes    2565
Name: Chronic Lung Disease, dtype: int64 

Yes    2736
No     2698
Name: Headache, dtype: int64 

No     2911
Yes    2523
Name: Heart Disease, dtype: int64 

No     2846
Yes    2588
Name: Diabetes, dtype: int64 

No     2771
Yes    2663
Name: Hyper Tension, dtype: int64 

Yes    2821
No     2613
Name: Fatigue , dtype: int64 

No     2883
Yes    2551
Name: Gastrointestinal , dtype: int64 

No     2983
Yes    2451
Name: Abroad travel, dtype: int64 

Yes    2726
No     2708
Name: Contact with COVID Patient, dtype: int64 

No     2924
Yes    2510
Name: Attended Large Gathering, dtype: int64 

Yes    2820
No     2614
Name: Visited Public Exposed Places, dtype: int64 

No     3172
Yes    2262
Name: Family working in Public Exposed Places, dtype: int64 

No    5434
Name: Wearing Masks, dtype: int64 

No    5434
Name: Sanitization from Market, dtype: int64 

Yes    4383
No     1051
Name: COVID-19, dtype: int64 

Data Visualization¶

In [8]:
px.histogram(data, x="COVID-19", color= 'COVID-19') 
In [9]:
fig = go.Figure(data=[go.Pie(labels=data["COVID-19"].value_counts().index,values= data["COVID-19"].value_counts().values ,hole=.3, pull=[0.1,0.1],textinfo='label+percent')])

                      
fig.update_layout(

    title = "COVID_19 percentage of cases"

    
)

fig.show()
In [10]:
px.histogram(data, x='Breathing Problem', color = 'COVID-19') 
In [11]:
px.histogram(data, x='Fever', color = 'COVID-19') 
In [12]:
px.histogram(data, x='Dry Cough', color = 'COVID-19') 
In [13]:
px.histogram(data, x='Sore throat', color = 'COVID-19') 
In [14]:
data.columns
Out[14]:
Index(['Breathing Problem', 'Fever', 'Dry Cough', 'Sore throat',
       'Running Nose', 'Asthma', 'Chronic Lung Disease', 'Headache',
       'Heart Disease', 'Diabetes', 'Hyper Tension', 'Fatigue ',
       'Gastrointestinal ', 'Abroad travel', 'Contact with COVID Patient',
       'Attended Large Gathering', 'Visited Public Exposed Places',
       'Family working in Public Exposed Places', 'Wearing Masks',
       'Sanitization from Market', 'COVID-19'],
      dtype='object')
In [ ]:
 
In [ ]:
 
In [15]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5434 entries, 0 to 5433
Data columns (total 21 columns):
 #   Column                                   Non-Null Count  Dtype 
---  ------                                   --------------  ----- 
 0   Breathing Problem                        5434 non-null   object
 1   Fever                                    5434 non-null   object
 2   Dry Cough                                5434 non-null   object
 3   Sore throat                              5434 non-null   object
 4   Running Nose                             5434 non-null   object
 5   Asthma                                   5434 non-null   object
 6   Chronic Lung Disease                     5434 non-null   object
 7   Headache                                 5434 non-null   object
 8   Heart Disease                            5434 non-null   object
 9   Diabetes                                 5434 non-null   object
 10  Hyper Tension                            5434 non-null   object
 11  Fatigue                                  5434 non-null   object
 12  Gastrointestinal                         5434 non-null   object
 13  Abroad travel                            5434 non-null   object
 14  Contact with COVID Patient               5434 non-null   object
 15  Attended Large Gathering                 5434 non-null   object
 16  Visited Public Exposed Places            5434 non-null   object
 17  Family working in Public Exposed Places  5434 non-null   object
 18  Wearing Masks                            5434 non-null   object
 19  Sanitization from Market                 5434 non-null   object
 20  COVID-19                                 5434 non-null   object
dtypes: object(21)
memory usage: 891.6+ KB
In [16]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
In [17]:
data['Breathing Problem']=le.fit_transform(data['Breathing Problem'])
data['Fever']=le.fit_transform(data['Fever'])
data['Gastrointestinal ']=le.fit_transform(data['Gastrointestinal '])
data['Contact with COVID Patient']=le.fit_transform(data['Contact with COVID Patient'])
data['Attended Large Gathering']=le.fit_transform(data['Attended Large Gathering'])
data['Visited Public Exposed Places']=le.fit_transform(data['Visited Public Exposed Places'])
data['Chronic Lung Disease']=le.fit_transform(data['Chronic Lung Disease'])
data['Headache']=le.fit_transform(data['Headache'])
data['Fatigue ']=le.fit_transform(data['Fatigue '])
data['Running Nose']=le.fit_transform(data['Running Nose'])
data['Asthma']=le.fit_transform(data['Asthma'])
data['Family working in Public Exposed Places']=le.fit_transform(data['Family working in Public Exposed Places'])
data['Wearing Masks']=le.fit_transform(data['Wearing Masks'])
data['Sanitization from Market']=le.fit_transform(data['Sanitization from Market'])
data['COVID-19']=le.fit_transform(data['COVID-19'])
data['Heart Disease']=le.fit_transform(data['Heart Disease'])
data['Diabetes']=le.fit_transform(data['Diabetes'])
data['Hyper Tension']=le.fit_transform(data['Hyper Tension'])
data['Abroad travel']=le.fit_transform(data['Abroad travel'])
data['Dry Cough']=le.fit_transform(data['Dry Cough'])
data['Sore throat']=le.fit_transform(data['Sore throat'])
data['Dry Cough']=le.fit_transform(data['Dry Cough'])
data['Sore throat']=le.fit_transform(data['Sore throat'])
In [18]:
data.head()
Out[18]:
Breathing Problem Fever Dry Cough Sore throat Running Nose Asthma Chronic Lung Disease Headache Heart Disease Diabetes ... Fatigue Gastrointestinal Abroad travel Contact with COVID Patient Attended Large Gathering Visited Public Exposed Places Family working in Public Exposed Places Wearing Masks Sanitization from Market COVID-19
0 1 1 1 1 1 0 0 0 0 1 ... 1 1 0 1 0 1 1 0 0 1
1 1 1 1 1 0 1 1 1 0 0 ... 1 0 0 0 1 1 0 0 0 1
2 1 1 1 1 1 1 1 1 0 1 ... 1 1 1 0 0 0 0 0 0 1
3 1 1 1 0 0 1 0 0 1 1 ... 0 0 1 0 1 1 0 0 0 1
4 1 1 1 1 1 0 1 1 1 1 ... 0 1 0 1 0 1 0 0 0 1

5 rows × 21 columns

In [19]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5434 entries, 0 to 5433
Data columns (total 21 columns):
 #   Column                                   Non-Null Count  Dtype
---  ------                                   --------------  -----
 0   Breathing Problem                        5434 non-null   int32
 1   Fever                                    5434 non-null   int32
 2   Dry Cough                                5434 non-null   int64
 3   Sore throat                              5434 non-null   int64
 4   Running Nose                             5434 non-null   int32
 5   Asthma                                   5434 non-null   int32
 6   Chronic Lung Disease                     5434 non-null   int32
 7   Headache                                 5434 non-null   int32
 8   Heart Disease                            5434 non-null   int32
 9   Diabetes                                 5434 non-null   int32
 10  Hyper Tension                            5434 non-null   int32
 11  Fatigue                                  5434 non-null   int32
 12  Gastrointestinal                         5434 non-null   int32
 13  Abroad travel                            5434 non-null   int32
 14  Contact with COVID Patient               5434 non-null   int32
 15  Attended Large Gathering                 5434 non-null   int32
 16  Visited Public Exposed Places            5434 non-null   int32
 17  Family working in Public Exposed Places  5434 non-null   int32
 18  Wearing Masks                            5434 non-null   int32
 19  Sanitization from Market                 5434 non-null   int32
 20  COVID-19                                 5434 non-null   int32
dtypes: int32(19), int64(2)
memory usage: 488.3 KB

Data Analysis¶

In [20]:
data.describe(include="all")
Out[20]:
Breathing Problem Fever Dry Cough Sore throat Running Nose Asthma Chronic Lung Disease Headache Heart Disease Diabetes ... Fatigue Gastrointestinal Abroad travel Contact with COVID Patient Attended Large Gathering Visited Public Exposed Places Family working in Public Exposed Places Wearing Masks Sanitization from Market COVID-19
count 5434.000000 5434.000000 5434.000000 5434.000000 5434.000000 5434.000000 5434.000000 5434.000000 5434.000000 5434.000000 ... 5434.000000 5434.000000 5434.000000 5434.000000 5434.000000 5434.000000 5434.000000 5434.0 5434.0 5434.000000
mean 0.666176 0.786345 0.792602 0.727457 0.543246 0.462643 0.472028 0.503497 0.464299 0.476261 ... 0.519139 0.469452 0.451049 0.501656 0.461907 0.518955 0.416268 0.0 0.0 0.806588
std 0.471621 0.409924 0.405480 0.445309 0.498172 0.498648 0.499263 0.500034 0.498770 0.499482 ... 0.499680 0.499112 0.497644 0.500043 0.498593 0.499687 0.492984 0.0 0.0 0.395009
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.0 0.000000
25% 0.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.0 1.000000
50% 1.000000 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 ... 1.000000 0.000000 0.000000 1.000000 0.000000 1.000000 0.000000 0.0 0.0 1.000000
75% 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.0 0.0 1.000000
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.0 0.0 1.000000

8 rows × 21 columns

In [21]:
data=data.drop('Sanitization from Market',axis=1)
data=data.drop('Wearing Masks',axis=1)
In [22]:
data.hist(figsize=(20,20),bins=3);

Correlation heatmap¶

In [23]:
data.corr()
Out[23]:
Breathing Problem Fever Dry Cough Sore throat Running Nose Asthma Chronic Lung Disease Headache Heart Disease Diabetes Hyper Tension Fatigue Gastrointestinal Abroad travel Contact with COVID Patient Attended Large Gathering Visited Public Exposed Places Family working in Public Exposed Places COVID-19
Breathing Problem 1.000000 0.089903 0.159562 0.303768 0.055190 0.075318 -0.098291 -0.062172 -0.073366 0.055427 0.045256 0.000561 -0.075390 0.117795 0.214634 0.200304 0.066688 0.018295 0.443764
Fever 0.089903 1.000000 0.127580 0.322235 0.081758 0.073953 -0.025160 -0.035416 -0.031462 0.050286 0.079001 -0.060458 -0.008067 0.128726 0.164704 0.070490 0.002252 0.012102 0.352891
Dry Cough 0.159562 0.127580 1.000000 0.213907 -0.030763 0.086843 -0.043664 -0.035912 0.047566 -0.006593 0.081989 -0.039909 0.008251 0.331418 0.128330 0.117963 0.086176 0.163102 0.464292
Sore throat 0.303768 0.322235 0.213907 1.000000 0.039450 0.081377 -0.050440 -0.015971 0.002177 0.001938 0.042811 -0.023290 0.025886 0.205986 0.189251 0.216438 0.079055 0.104378 0.502848
Running Nose 0.055190 0.081758 -0.030763 0.039450 1.000000 -0.022763 -0.014376 0.068479 -0.056750 0.042961 -0.020445 0.007026 -0.014673 0.034526 0.003776 0.061099 0.032568 -0.061323 -0.005657
Asthma 0.075318 0.073953 0.086843 0.081377 -0.022763 1.000000 -0.033771 0.037064 0.076783 -0.012060 0.017707 0.006564 0.101909 0.068286 0.005046 -0.044592 0.020941 -0.115679 0.089930
Chronic Lung Disease -0.098291 -0.025160 -0.043664 -0.050440 -0.014376 -0.033771 1.000000 -0.050480 -0.039860 0.046789 -0.010331 -0.047655 -0.050333 -0.088854 -0.062482 -0.020548 -0.093049 0.038343 -0.056837
Headache -0.062172 -0.035416 -0.035912 -0.015971 0.068479 0.037064 -0.050480 1.000000 0.048471 0.032390 -0.207489 0.052035 0.097778 0.043589 -0.082101 -0.162992 -0.005790 -0.012625 -0.027793
Heart Disease -0.073366 -0.031462 0.047566 0.002177 -0.056750 0.076783 -0.039860 0.048471 1.000000 -0.032956 0.049139 -0.058925 0.004121 -0.020761 -0.025593 -0.045437 0.086169 0.035000 0.027072
Diabetes 0.055427 0.050286 -0.006593 0.001938 0.042961 -0.012060 0.046789 0.032390 -0.032956 1.000000 0.042543 -0.043903 0.040651 0.039013 -0.085696 -0.061650 -0.078212 0.097696 0.040627
Hyper Tension 0.045256 0.079001 0.081989 0.042811 -0.020445 0.017707 -0.010331 -0.207489 0.049139 0.042543 1.000000 -0.027605 -0.067972 -0.016382 0.027307 0.002911 0.019174 0.048152 0.102575
Fatigue 0.000561 -0.060458 -0.039909 -0.023290 0.007026 0.006564 -0.047655 0.052035 -0.058925 -0.043903 -0.027605 1.000000 0.009356 -0.068401 -0.027383 -0.031058 -0.009562 -0.025623 -0.044188
Gastrointestinal -0.075390 -0.008067 0.008251 0.025886 -0.014673 0.101909 -0.050333 0.097778 0.004121 0.040651 -0.067972 0.009356 1.000000 0.099577 0.025277 -0.017251 -0.061885 -0.027603 -0.003367
Abroad travel 0.117795 0.128726 0.331418 0.205986 0.034526 0.068286 -0.088854 0.043589 -0.020761 0.039013 -0.016382 -0.068401 0.099577 1.000000 0.080210 0.113399 0.069609 0.143094 0.443875
Contact with COVID Patient 0.214634 0.164704 0.128330 0.189251 0.003776 0.005046 -0.062482 -0.082101 -0.025593 -0.085696 0.027307 -0.027383 0.025277 0.080210 1.000000 0.234649 0.079800 0.006909 0.357122
Attended Large Gathering 0.200304 0.070490 0.117963 0.216438 0.061099 -0.044592 -0.020548 -0.162992 -0.045437 -0.061650 0.002911 -0.031058 -0.017251 0.113399 0.234649 1.000000 0.083795 0.063776 0.390145
Visited Public Exposed Places 0.066688 0.002252 0.086176 0.079055 0.032568 0.020941 -0.093049 -0.005790 0.086169 -0.078212 0.019174 -0.009562 -0.061885 0.069609 0.079800 0.083795 1.000000 0.028486 0.119755
Family working in Public Exposed Places 0.018295 0.012102 0.163102 0.104378 -0.061323 -0.115679 0.038343 -0.012625 0.035000 0.097696 0.048152 -0.025623 -0.027603 0.143094 0.006909 0.063776 0.028486 1.000000 0.160208
COVID-19 0.443764 0.352891 0.464292 0.502848 -0.005657 0.089930 -0.056837 -0.027793 0.027072 0.040627 0.102575 -0.044188 -0.003367 0.443875 0.357122 0.390145 0.119755 0.160208 1.000000
In [24]:
fig, ax = plt.subplots(figsize=(10,8))    
ax = sns.heatmap(data.corr(), cmap="rocket")

Feature Selection¶

In [25]:
data=data.drop('Running Nose',axis=1)
data=data.drop('Chronic Lung Disease',axis=1)
data=data.drop('Headache',axis=1)
data=data.drop('Heart Disease',axis=1)
data=data.drop('Diabetes',axis=1)
data=data.drop('Gastrointestinal ',axis=1)
data=data.drop('Asthma',axis=1)
In [ ]:
 

Machine Learning¶

Train Test Split¶

In [26]:
x=data.drop('COVID-19',axis=1)
y=data['COVID-19']
In [27]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20)
In [28]:
x.columns
Out[28]:
Index(['Breathing Problem', 'Fever', 'Dry Cough', 'Sore throat',
       'Hyper Tension', 'Fatigue ', 'Abroad travel',
       'Contact with COVID Patient', 'Attended Large Gathering',
       'Visited Public Exposed Places',
       'Family working in Public Exposed Places'],
      dtype='object')
In [29]:
type(x_train)
Out[29]:
pandas.core.frame.DataFrame

Logistic Regression¶

In [30]:
from sklearn.linear_model import LogisticRegression 
 
log_cls = LogisticRegression(random_state = 10)
log_cls.fit(x_train, y_train)
y_pred = log_cls.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("\nF1 Score : ",f1_score(y_test, y_pred))
[[194  18]
 [ 21 854]]

F1 Score :  0.9776760160274756
In [31]:
log_cls.feature_names_in_
Out[31]:
array(['Breathing Problem', 'Fever', 'Dry Cough', 'Sore throat',
       'Hyper Tension', 'Fatigue ', 'Abroad travel',
       'Contact with COVID Patient', 'Attended Large Gathering',
       'Visited Public Exposed Places',
       'Family working in Public Exposed Places'], dtype=object)
In [32]:
acc_log_cls = log_cls.score(x_test, y_test)
In [33]:
print("Train Accuracy ",log_cls.score(x_train, y_train))
print("Test Accuracy ",log_cls.score(x_test, y_test))
Train Accuracy  0.9648033126293996
Test Accuracy  0.9641214351425943
In [34]:
#dictionary --> sort --> top 5 or 10 features 
In [35]:
dicta = {}
dicta.clear()
for i in range(0,11):
    # print( classifier.coef_[0][i], " <--- ",x.columns[i]  )
    dicta.update({log_cls.feature_names_in_[i] : abs(log_cls.coef_[0][i])})
    
print("\nTop Features in order")
dict(sorted(dicta.items(), key=lambda item: item[1],reverse=True))
Top Features in order
Out[35]:
{'Abroad travel': 5.372912765719051,
 'Attended Large Gathering': 5.077488056601601,
 'Fever': 2.6501414475857326,
 'Sore throat': 2.6466919628879277,
 'Dry Cough': 2.6243375328714,
 'Breathing Problem': 2.496947254454527,
 'Contact with COVID Patient': 1.625022489400047,
 'Family working in Public Exposed Places': 1.0419210208361478,
 'Fatigue ': 0.03643879257919243,
 'Visited Public Exposed Places': 0.036238658197606737,
 'Hyper Tension': 0.014991447317832734}
In [36]:
log_cls.coef_
Out[36]:
array([[ 2.49694725,  2.65014145,  2.62433753,  2.64669196, -0.01499145,
        -0.03643879,  5.37291277,  1.62502249,  5.07748806,  0.03623866,
         1.04192102]])
In [37]:
## Abroad travel, Attended Large Gathering, Fever, Dry Cough, Sore throat, Breathing Problem are the major factors that the logistic regression has considered while training the model
In [38]:
 x.columns
Out[38]:
Index(['Breathing Problem', 'Fever', 'Dry Cough', 'Sore throat',
       'Hyper Tension', 'Fatigue ', 'Abroad travel',
       'Contact with COVID Patient', 'Attended Large Gathering',
       'Visited Public Exposed Places',
       'Family working in Public Exposed Places'],
      dtype='object')
In [ ]:
 
In [ ]:
 
In [ ]:
 

SVM¶

In [39]:
## SVM 
from sklearn.svm import SVC  
svm_clf = SVC(kernel='linear') 
  
# fitting x samples and y classes 
svm_clf.fit(x, y) 
Out[39]:
SVC(kernel='linear')
In [40]:
## Features are NOT independent ?
In [41]:
pred = svm_clf.predict(x_test)
# check the accuracy on the training set
print("Train Accuracy ",svm_clf.score(x_train, y_train))
print("Test Accuracy ",svm_clf.score(x_test, y_test))
Train Accuracy  0.9682539682539683
Test Accuracy  0.9696412143514259
In [42]:
acc_svm_clf = svm_clf.score(x_test, y_test)
In [43]:
dicta = {}
dicta.clear()
for i in range(0,11):
    # print( classifier.coef_[0][i], " <--- ",x.columns[i]  )
    dicta.update({svm_clf.feature_names_in_[i] : abs(svm_clf.coef_[0][i])})
    
print("\nTop Features in order")
dict(sorted(dicta.items(), key=lambda item: item[1],reverse=True))
Top Features in order
Out[43]:
{'Attended Large Gathering': 3.998881391660475,
 'Abroad travel': 2.000004449747624,
 'Breathing Problem': 1.9996603622461926,
 'Sore throat': 1.9995789626697462,
 'Dry Cough': 1.9995320516416175,
 'Fever': 1.9995181985347399,
 'Family working in Public Exposed Places': 0.00020887689702986378,
 'Contact with COVID Patient': 0.00014710696201447604,
 'Visited Public Exposed Places': 0.00014356045880958845,
 'Fatigue ': 6.209827473835361e-05,
 'Hyper Tension': 5.723246616540223e-05}
In [ ]:
 
In [ ]:
 

Random Forest¶

In [44]:
#Train the model
from sklearn.ensemble import RandomForestClassifier
randf_model = RandomForestClassifier(n_estimators=1000)
#Fit
randf_model.fit(x_train, y_train)
#Score/Accuracy
acc_randomforest=randf_model.score(x_test, y_test)*100
acc_randomforest
Out[44]:
97.42410303587856
In [45]:
acc_randf_model = randf_model.score(x_test, y_test)
In [46]:
print("Train Accuracy ",randf_model.score(x_train, y_train))
print("Test Accuracy ",randf_model.score(x_test, y_test))
Train Accuracy  0.9802162410858063
Test Accuracy  0.9742410303587856
In [47]:
dicta = {}
dicta.clear()
for i in range(0,11):
    # print( classifier.coef_[0][i], " <--- ",x.columns[i]  )
    dicta.update({randf_model.feature_names_in_[i] : abs(randf_model.feature_importances_[i])})
    
print("\nTop Features in order")
dict(sorted(dicta.items(), key=lambda item: item[1],reverse=True))
Top Features in order
Out[47]:
{'Breathing Problem': 0.17249560340237732,
 'Abroad travel': 0.16842914501113226,
 'Sore throat': 0.16393784895499064,
 'Dry Cough': 0.13963722944902596,
 'Attended Large Gathering': 0.10549212034101212,
 'Fever': 0.08097135086503054,
 'Contact with COVID Patient': 0.07506301714015166,
 'Family working in Public Exposed Places': 0.026882853602896196,
 'Fatigue ': 0.023774465688318033,
 'Visited Public Exposed Places': 0.022915999513112247,
 'Hyper Tension': 0.02040036603195308}
In [ ]:
 

KNN¶

In [48]:
# KNN

error_rate = []
  
# Will take some time
for i in range(1, 10):
      
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(x_train, y_train)
    pred_i = knn.predict(x_test)
    error_rate.append(np.mean(pred_i != y_test))
  
plt.figure(figsize =(10, 6))
plt.plot(range(1, 10), error_rate, color ='blue',
                linestyle ='dashed', marker ='o',
         markerfacecolor ='red', markersize = 10)
  
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
Out[48]:
Text(0, 0.5, 'Error Rate')
In [ ]:
 
In [49]:
num_neighbor = 6
knn_clf = KNeighborsClassifier(n_neighbors = num_neighbor)
  
knn_clf.fit(x_train, y_train)
pred = knn_clf.predict(x_test)
  
print('WITH K = ',num_neighbor )
print('\n')
print(confusion_matrix(y_test, pred))
print('\n')
print(classification_report(y_test, pred))
WITH K =  6


[[205   7]
 [ 19 856]]


              precision    recall  f1-score   support

           0       0.92      0.97      0.94       212
           1       0.99      0.98      0.99       875

    accuracy                           0.98      1087
   macro avg       0.95      0.97      0.96      1087
weighted avg       0.98      0.98      0.98      1087

In [50]:
acc_knn_clf = knn_clf.score(x_test, y_test)
In [51]:
print("Train Accuracy ",knn_clf.score(x_train, y_train))
print("Test Accuracy ",knn_clf.score(x_test, y_test))
Train Accuracy  0.9786059351276742
Test Accuracy  0.9760809567617296
In [ ]:
 
In [ ]:
 

Decision Tree¶

In [52]:
from sklearn import tree
d_t = tree.DecisionTreeClassifier(criterion= 'gini')
d_t.fit(x_train,y_train)
y_pred = d_t.predict(x_test)
#Score/Accuracy
acc_decisiontree=d_t.score(x_test, y_test)*100
acc_decisiontree
Out[52]:
97.42410303587856
In [53]:
dicta = {}
dicta.clear()
for i in range(0,11):
    # print( classifier.coef_[0][i], " <--- ",x.columns[i]  )
    dicta.update({d_t.feature_names_in_[i] : abs(d_t.feature_importances_[i])})
    
print("\nTop Features in order")
dict(sorted(dicta.items(), key=lambda item: item[1],reverse=True))
Top Features in order
Out[53]:
{'Sore throat': 0.2727999178980045,
 'Breathing Problem': 0.22542975014843122,
 'Abroad travel': 0.21362696378121346,
 'Dry Cough': 0.07933788753820542,
 'Attended Large Gathering': 0.07166074650165857,
 'Contact with COVID Patient': 0.0548523014046704,
 'Fever': 0.03560931296377859,
 'Fatigue ': 0.019134689111653308,
 'Visited Public Exposed Places': 0.012846166366675555,
 'Hyper Tension': 0.0075984049723293,
 'Family working in Public Exposed Places': 0.007103859313379582}
In [54]:
acc_d_t = d_t.score(x_test, y_test)
In [55]:
print("Train Accuracy ",d_t.score(x_train, y_train))
print("Test Accuracy ",d_t.score(x_test, y_test))
Train Accuracy  0.9802162410858063
Test Accuracy  0.9742410303587856
In [ ]:
 

Naive Bayes¶

In [56]:
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model.fit(x_train,y_train)
#Score/Accuracy
acc_gaussian= nb_model.score(x_test, y_test)*100
acc_gaussian
Out[56]:
75.71297148114076
In [57]:
acc_nb_model=nb_model.score(x_test, y_test)
In [58]:
print("Train Accuracy ",nb_model.score(x_train, y_train))
print("Test Accuracy ",nb_model.score(x_test, y_test))
Train Accuracy  0.756383712905452
Test Accuracy  0.7571297148114076
In [ ]:
 
In [ ]:
 
In [ ]:
 

XGBoost¶

In [59]:
#XGBoost
#!pip install xgboost
from xgboost import XGBClassifier
xgb_model = XGBClassifier()
xgb_model.fit(x_train, y_train)
preds = xgb_model.predict(x_test)
#xgb_model.accuracy_score(y_test, preds)
C:\Users\skaks\miniconda3\lib\site-packages\xgboost\compat.py:36: FutureWarning:

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.

C:\Users\skaks\miniconda3\lib\site-packages\xgboost\sklearn.py:1224: UserWarning:

The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].

C:\Users\skaks\miniconda3\lib\site-packages\xgboost\data.py:262: FutureWarning:

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.

[16:16:55] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
In [60]:
print("Train Accuracy ",xgb_model.score(x_train, y_train))
print("Test Accuracy ",xgb_model.score(x_test, y_test))
Train Accuracy  0.9802162410858063
Test Accuracy  0.9742410303587856
C:\Users\skaks\miniconda3\lib\site-packages\xgboost\data.py:262: FutureWarning:

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.

In [61]:
acc_xgb_model = xgb_model.score(x_test, y_test)
C:\Users\skaks\miniconda3\lib\site-packages\xgboost\data.py:262: FutureWarning:

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.

In [ ]:
 
In [ ]:
 
In [62]:
from xgboost import plot_importance
plot_importance(xgb_model, )
Out[62]:
<AxesSubplot:title={'center':'Feature importance'}, xlabel='F score', ylabel='Features'>
In [ ]:
 

Adaboost¶

In [63]:
# AdaBoost Algorithm
from sklearn.ensemble import AdaBoostClassifier
adb_model = AdaBoostClassifier()
# n_estimators = 50 (default value) 
# base_estimator = DecisionTreeClassifier (default value)
adb_model.fit(x_train,y_train)
preds = adb_model.predict(x_test)
#adb_model.accuracy_score(y_test, preds)
In [64]:
dicta = {}
dicta.clear()
for i in range(0,11):
    # print( classifier.coef_[0][i], " <--- ",x.columns[i]  )
    dicta.update({adb_model.feature_names_in_[i] : abs(adb_model.feature_importances_[i])})
    
print("\nTop Features in order")
dict(sorted(dicta.items(), key=lambda item: item[1],reverse=True))
Top Features in order
Out[64]:
{'Attended Large Gathering': 0.26,
 'Fever': 0.14,
 'Dry Cough': 0.12,
 'Sore throat': 0.12,
 'Contact with COVID Patient': 0.1,
 'Breathing Problem': 0.08,
 'Hyper Tension': 0.06,
 'Family working in Public Exposed Places': 0.06,
 'Fatigue ': 0.02,
 'Abroad travel': 0.02,
 'Visited Public Exposed Places': 0.02}
In [65]:
print("Train Accuracy ",adb_model.score(x_train, y_train))
print("Test Accuracy ",adb_model.score(x_test, y_test))
Train Accuracy  0.9627329192546584
Test Accuracy  0.9613615455381784
In [66]:
acc_adb_model=adb_model.score(x_test, y_test)
In [ ]:
 
In [67]:
t_feat = {'Features': 
          ['Breathing Problem', #1
           'Fever', #2
           'Dry Cough',  #3
           'Sore throat',#4
           'Hyper Tension',#5
           'Fatigue ',#6
           'Abroad travel',#7
           'Contact with COVID Patient',#8
           'Attended Large Gathering',#9
           'Visited Public Exposed Places',#10
           'Family working in Public Exposed Places']#11
          ,
        'Count':[5,2,7,6,1,1,5,0,5,1 ,0]}
              #  1,2,3,4,5,6,7,8,9,10,11
# Create DataFrame
top_feat = pd.DataFrame(t_feat)
top_feat
Out[67]:
Features Count
0 Breathing Problem 5
1 Fever 2
2 Dry Cough 7
3 Sore throat 6
4 Hyper Tension 1
5 Fatigue 1
6 Abroad travel 5
7 Contact with COVID Patient 0
8 Attended Large Gathering 5
9 Visited Public Exposed Places 1
10 Family working in Public Exposed Places 0
In [68]:
fig, ax = plt.subplots(figsize=(10, 8))

# Plot horizontal bar graph
top_feat.sort_values(by='Count').plot.bar(x='Features',
                      y='Count',
                      ax=ax,
                      color="purple")

ax.set_title("Count of features apprearing in the top 5 most important features from all the trained models ")

plt.show()
In [ ]:
 
In [69]:
## ROC curve n AUC (higher area, higher accuracy)
In [70]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot


ns_probs = [0 for _ in range(len(y_test))]
# fit a model
model = LogisticRegression(solver='lbfgs')
model.fit(x_train, y_train)
# predict probabilities
lr_probs = model.predict_proba(x_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# calculate scores
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
# plot the roc curve for the model
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
No Skill: ROC AUC=0.500
Logistic: ROC AUC=0.993
In [71]:
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot
from sklearn.metrics import precision_recall_curve

model = LogisticRegression()
model.fit(x_train, y_train)
# predict probabilities
lr_probs = model.predict_proba(x_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# predict class values
yhat = model.predict(x_test)
# calculate precision and recall for each threshold
lr_precision, lr_recall, _ = precision_recall_curve(y_test, lr_probs)
# calculate scores
lr_f1, lr_auc = f1_score(y_test, yhat), auc(lr_recall, lr_precision)
# summarize scores
print('Logistic: f1=%.3f auc=%.3f' % (lr_f1, lr_auc))
# plot the precision-recall curves
no_skill = len(y_test[y_test==1]) / len(y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(lr_recall, lr_precision, marker='.', label='Logistic')
# axis labels
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
Logistic: f1=0.978 auc=0.998
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [72]:
from sklearn.neural_network import MLPClassifier
In [73]:
mlp = MLPClassifier(hidden_layer_sizes=(8,8,8), activation='relu', solver='adam', max_iter=500)
mlp.fit(x_train,y_train)

predict_train = mlp.predict(x_train)
predict_test = mlp.predict(x_test)
In [74]:
print(confusion_matrix(y_train,predict_train))
print(classification_report(y_train,predict_train))
[[ 799   40]
 [  54 3454]]
              precision    recall  f1-score   support

           0       0.94      0.95      0.94       839
           1       0.99      0.98      0.99      3508

    accuracy                           0.98      4347
   macro avg       0.96      0.97      0.97      4347
weighted avg       0.98      0.98      0.98      4347

In [75]:
print("Train Accuracy ",mlp.score(x_train, y_train))
print("Test Accuracy ",mlp.score(x_test, y_test))
Train Accuracy  0.9783758914193696
Test Accuracy  0.9760809567617296
In [76]:
acc_mlp = mlp.score(x_test, y_test)
In [ ]:
 
In [77]:
Knn_pred = knn.predict(x)
In [78]:
Knn_pred
Out[78]:
array([1, 1, 1, ..., 0, 0, 0])
In [79]:
x['KNN_Output'] = Knn_pred
In [80]:
x.head()
Out[80]:
Breathing Problem Fever Dry Cough Sore throat Hyper Tension Fatigue Abroad travel Contact with COVID Patient Attended Large Gathering Visited Public Exposed Places Family working in Public Exposed Places KNN_Output
0 1 1 1 1 1 1 0 1 0 1 1 1
1 1 1 1 1 0 1 0 0 1 1 0 1
2 1 1 1 1 0 1 1 0 0 0 0 1
3 1 1 1 0 0 0 1 0 1 1 0 1
4 1 1 1 1 1 0 0 1 0 1 0 1
In [81]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(x, y, test_size = 0.20)
In [82]:
mlp_knn = MLPClassifier(hidden_layer_sizes=(8,8,8), activation='relu', solver='adam', max_iter=500)
mlp_knn.fit(x_train1,y_train1)

predict_train = mlp_knn.predict(x_train1)
predict_test = mlp_knn.predict(x_test1)
In [83]:
print(confusion_matrix(y_train,predict_train))
print(classification_report(y_train,predict_train))
[[ 157  682]
 [ 688 2820]]
              precision    recall  f1-score   support

           0       0.19      0.19      0.19       839
           1       0.81      0.80      0.80      3508

    accuracy                           0.68      4347
   macro avg       0.50      0.50      0.50      4347
weighted avg       0.69      0.68      0.69      4347

In [84]:
print("Train Accuracy ",mlp_knn.score(x_train1, y_train1))
print("Test Accuracy ",mlp_knn.score(x_test1, y_test1))
Train Accuracy  0.9774557165861514
Test Accuracy  0.9760809567617296
In [85]:
acc_mlp_knn = mlp_knn.score(x_test1, y_test1)
In [ ]:
 
In [86]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression','Random Forest', 'Naive Bayes', 'Decision Tree', 'XGBoost','AdaBoost', 'MLP Classifier','MLP with KNN as Feature'],
    'Score': [acc_svm_clf,
              acc_knn_clf,
              acc_log_cls,
              acc_randf_model,
              acc_nb_model,
              acc_d_t,
              acc_xgb_model,
              acc_adb_model,
              acc_mlp,
              acc_mlp_knn]})
models.sort_values(by='Score', ascending=False)
Out[86]:
Model Score
1 KNN 0.976081
8 MLP Classifier 0.976081
9 MLP with KNN as Feature 0.976081
3 Random Forest 0.974241
5 Decision Tree 0.974241
6 XGBoost 0.974241
0 Support Vector Machines 0.969641
2 Logistic Regression 0.964121
7 AdaBoost 0.961362
4 Naive Bayes 0.757130
In [ ]:
 
In [ ]:
 
In [ ]: